# Packages required for the pipeline: data import (readxl), wrangling
# (tidyverse/dplyr/lubridate/fastDummies), matching (MatchIt/cobalt),
# and causal-forest estimation (grf).
packages <- c("readxl", "ggplot2", "tidyverse", "dplyr", "lubridate", "lessR",
              "MatchIt", "cobalt", "eeptools", "grf", "tidymodels",
              "fastDummies")
# Install only the packages that are not yet installed.
is_installed <- packages %in% rownames(installed.packages())
if (any(!is_installed)) {
  install.packages(packages[!is_installed])
}
# Load everything; invisible() suppresses the list returned by lapply().
invisible(lapply(packages, library, character.only = TRUE))
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
##
## Attaching package: 'lubridate'
##
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
##
##
##
## lessR 4.2.8 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read text, Excel, SPSS, SAS, or R data file
## d is default data frame, data= in analysis routines optional
##
## Learn about reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables.
## Enter: browseVignettes("lessR")
##
## View changes in this and recent versions of lessR.
## Enter: news(package="lessR")
##
## Interactive data analysis.
## Enter: interact()
##
##
##
## Attaching package: 'lessR'
##
##
## The following objects are masked from 'package:dplyr':
##
## recode, rename
##
##
## cobalt (Version 4.5.1, Build Date: 2023-04-27)
##
##
## Attaching package: 'cobalt'
##
##
## The following object is masked from 'package:MatchIt':
##
## lalonde
##
##
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
##
## ✔ broom 1.0.1 ✔ rsample 1.1.0
## ✔ dials 1.0.0 ✔ tune 1.0.1
## ✔ infer 1.0.3 ✔ workflows 1.1.0
## ✔ modeldata 1.0.1 ✔ workflowsets 1.0.0
## ✔ parsnip 1.0.2 ✔ yardstick 1.1.0
## ✔ recipes 1.0.2
##
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lessR::recode() masks dplyr::recode()
## ✖ lessR::rename() masks dplyr::rename()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use tidymodels_prefer() to resolve common conflicts.
##
## Thank you for using fastDummies!
##
## To acknowledge our work, please cite the package:
##
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
# Raw data import: two message extracts plus the goals and users tables.
# NOTE(review): paths are machine-specific; consider a configurable data
# directory (e.g. here::here()) so the script runs on other machines.
msg_1 <- read_excel("~/Desktop/Monkee data/Messages_1 with userid.xlsx")
msg_2 <- read_excel("~/Desktop/Monkee data/messages_2_with userid.xlsx")
goals <- read_excel("~/Desktop/Monkee data/Monkee_goals.xlsx")
users <- read_excel("~/Desktop/Monkee data/Monkee_users.xlsx")
# merge the message datasets
messages <- rbind(msg_1, msg_2)
# count NAs and decide NAs in which column need to be removed
colSums(is.na(messages))
## user_id message_uuid message_type
## 0 1 1
## message_status send_at reacted_at
## 1 1376 1376
## message_title message_message message_action
## 1376 1376 1376
## message_id message_button_label_1 message_button_label_2
## 2751 2751 2755
## message_button_label_3 button_pressed
## 2757 453073
# Require completeness on every message column except user_id (no NAs, per
# the counts above) and button_pressed (NA for most rows by design).
# Selecting by name replaces the brittle hard-coded index c(2:13), which
# silently breaks if the column order of the export ever changes.
complete_cols <- setdiff(names(messages), c("user_id", "button_pressed"))
msg_subset <- messages[, complete_cols]
clean_rows <- complete.cases(msg_subset)
messages_clean <- messages[clean_rows, ]
colSums(is.na(messages_clean))
## user_id message_uuid message_type
## 0 0 0
## message_status send_at reacted_at
## 0 0 0
## message_title message_message message_action
## 0 0 0
## message_id message_button_label_1 message_button_label_2
## 0 0 0
## message_button_label_3 button_pressed
## 0 450322
# Distinct values per column (5,008 unique users remain).
sapply(messages_clean, n_distinct) #5,008 users
## user_id message_uuid message_type
## 5008 641665 1
## message_status send_at reacted_at
## 2 239374 565468
## message_title message_message message_action
## 6 43078 2598
## message_id message_button_label_1 message_button_label_2
## 18 757 4
## message_button_label_3 button_pressed
## 3 167
# Per-column NA counts for the goals table.
colSums(is.na(goals))
## user_id goal_id starting_date end_date
## 0 1 1 1
## status archived_at target_amount week_target
## 1 16186 1 1
## starting_capital priority current_amount category
## 1 1 1 7517
## sub_category saving_mode goal_on_target
## 15277 1 1
# Drop the single row with a missing goal_id.
goals <- subset(goals, !is.na(goal_id))
# Inspect priority values; "u=3, i" is malformed (15 rows) and is removed.
table(goals$priority) # need to remove rows with a value of u=3, i (15 obs)
##
## HIGH NORMAL u=3, i
## 7476 19465 15
goals_clean <- subset(goals, priority != 'u=3, i')
colSums(is.na(goals_clean))
## user_id goal_id starting_date end_date
## 0 0 0 0
## status archived_at target_amount week_target
## 0 16176 0 0
## starting_capital priority current_amount category
## 0 0 0 7508
## sub_category saving_mode goal_on_target
## 15266 0 0
# Distinct values per column.
sapply(goals_clean, n_distinct) #11,779 users #26,941 goals #14 categories
## user_id goal_id starting_date end_date
## 11779 26941 26940 3306
## status archived_at target_amount week_target
## 3 10764 1462 4981
## starting_capital priority current_amount category
## 3090 2 10125 14
## sub_category saving_mode goal_on_target
## 85 2 2
# Per-column NA counts for the users table.
colSums(is.na(users))
## user_id date_of_birth gender
## 0 1 0
## joined_at country treatment_group
## 0 0 18094
## operating_system number_of_goals number_of_goals_active
## 8645 6314 6314
## number_of_goals_achieved number_of_goals_on_target
## 6314 6314
# Keep only users with goal statistics (drops the 6,314 rows whose goal
# counts are missing).
users_clean <- subset(users, !is.na(number_of_goals))
colSums(is.na(users_clean))
## user_id date_of_birth gender
## 0 0 0
## joined_at country treatment_group
## 0 0 11780
## operating_system number_of_goals number_of_goals_active
## 4782 0 0
## number_of_goals_achieved number_of_goals_on_target
## 0 0
# Distinct values per column.
sapply(users_clean, n_distinct) #11,780 users
## user_id date_of_birth gender
## 11780 7365 2
## joined_at country treatment_group
## 11780 2 1
## operating_system number_of_goals number_of_goals_active
## 5 54 30
## number_of_goals_achieved number_of_goals_on_target
## 15 28
# Inner-join messages, goals, and users on user_id (213,326 rows).
joint <- messages_clean %>%
  inner_join(goals_clean, by = "user_id") %>%
  inner_join(users_clean, by = "user_id") #213,326
# Distinct values per column.
sapply(joint, n_distinct) #4,988 users #641,391 msgs #19,242 goals
## user_id message_uuid message_type
## 4988 641391 1
## message_status send_at reacted_at
## 2 239239 565312
## message_title message_message message_action
## 6 43050 2598
## message_id message_button_label_1 message_button_label_2
## 18 756 4
## message_button_label_3 button_pressed goal_id
## 3 167 19242
## starting_date end_date status
## 19241 3018 3
## archived_at target_amount week_target
## 10636 1208 4306
## starting_capital priority current_amount
## 3088 2 10073
## category sub_category saving_mode
## 14 85 2
## goal_on_target date_of_birth gender
## 2 4002 2
## joined_at country treatment_group
## 4988 2 1
## operating_system number_of_goals number_of_goals_active
## 5 54 30
## number_of_goals_achieved number_of_goals_on_target
## 15 28
# Users with more than one goal.
multi_goal <- subset(joint, number_of_goals > 1)
sapply(multi_goal, n_distinct) #3,215 multi-goal users
## user_id message_uuid message_type
## 3215 464448 1
## message_status send_at reacted_at
## 2 169836 403461
## message_title message_message message_action
## 6 35954 1818
## message_id message_button_label_1 message_button_label_2
## 18 629 4
## message_button_label_3 button_pressed goal_id
## 3 144 17469
## starting_date end_date status
## 17468 2951 3
## archived_at target_amount week_target
## 10029 1143 4087
## starting_capital priority current_amount
## 2959 2 9513
## category sub_category saving_mode
## 14 85 2
## goal_on_target date_of_birth gender
## 2 2790 2
## joined_at country treatment_group
## 3215 2 1
## operating_system number_of_goals number_of_goals_active
## 5 53 30
## number_of_goals_achieved number_of_goals_on_target
## 15 28
# Users with exactly one goal.
single_goal <- subset(joint, number_of_goals == 1)
sapply(single_goal, n_distinct) #1,773 single-goal users
## user_id message_uuid message_type
## 1773 176943 1
## message_status send_at reacted_at
## 2 69836 165247
## message_title message_message message_action
## 6 11544 1471
## message_id message_button_label_1 message_button_label_2
## 16 466 4
## message_button_label_3 button_pressed goal_id
## 3 74 1773
## starting_date end_date status
## 1773 799 3
## archived_at target_amount week_target
## 608 258 911
## starting_capital priority current_amount
## 202 2 1005
## category sub_category saving_mode
## 14 65 2
## goal_on_target date_of_birth gender
## 2 1633 2
## joined_at country treatment_group
## 1773 2 1
## operating_system number_of_goals number_of_goals_active
## 5 1 2
## number_of_goals_achieved number_of_goals_on_target
## 2 2
# Distribution of message ids among single-goal users. Messages of interest:
# 2,3,4 (benchmark/control), 70,71,72 (absolute framing), 73,74,75
# (percentage framing).
table(single_goal$message_id) #message_id of interest: 2,3,4; 70,71,72; 73,74,75
##
## 1 2 3 4 5 6 7 58 62 70 71 72 73
## 20310 1082 4816 3558 11445 9957 10325 98601 12889 22 160 133 3316
## 74 75 78
## 239 89 1
# Keep only the nine experimental message ids (matched as character codes)
# and messages sent on or after 1 May 2021.
# NOTE(review): as.POSIXct() uses the session time zone here -- confirm
# send_at is recorded in the same zone.
sg_filtered <- single_goal[single_goal$message_id %in%
c('2', '3', '4',
'70', '71', '72',
'73', '74', '75'), ] %>%
filter(send_at >= as.POSIXct('2021-05-01 0:00:00'))
sapply(sg_filtered, function(x) n_distinct(x)) #1,112 users #9,583 msg
## user_id message_uuid message_type
## 1112 9583 1
## message_status send_at reacted_at
## 2 6911 9550
## message_title message_message message_action
## 2 1677 1399
## message_id message_button_label_1 message_button_label_2
## 9 419 1
## message_button_label_3 button_pressed goal_id
## 2 69 1112
## starting_date end_date status
## 1112 550 3
## archived_at target_amount week_target
## 285 183 632
## starting_capital priority current_amount
## 146 2 788
## category sub_category saving_mode
## 14 62 2
## goal_on_target date_of_birth gender
## 2 1050 2
## joined_at country treatment_group
## 1112 2 1
## operating_system number_of_goals number_of_goals_active
## 5 1 2
## number_of_goals_achieved number_of_goals_on_target
## 2 2
# Binary outcome: 1 if the message was viewed, 0 otherwise.
sg_filtered$viewed <- as.numeric(sg_filtered$message_status == 'VIEWED')
# control
# message_id is character here, so `== 2` relies on R coercing 2 to "2".
sg_filtered$benchmark = sg_filtered$message_id == 2 |
sg_filtered$message_id == 3 | sg_filtered$message_id == 4
table(sg_filtered$benchmark) #5624
##
## FALSE TRUE
## 3959 5624
# absolute
sg_filtered$absolute = sg_filtered$message_id == 70 |
sg_filtered$message_id == 71 | sg_filtered$message_id == 72
table(sg_filtered$absolute) #315
##
## FALSE TRUE
## 9268 315
# percentage
sg_filtered$percentage = sg_filtered$message_id == 73 |
sg_filtered$message_id == 74 | sg_filtered$message_id == 75
table(sg_filtered$percentage) #3644
##
## FALSE TRUE
## 5939 3644
# Build the treatment label. `treat` starts as a logical copy of benchmark;
# the first character subassignment coerces the whole vector to character.
# Every row falls into exactly one of the three groups
# (5624 + 315 + 3644 = 9583 = nrow), so no "TRUE"/"FALSE" residue remains
# after the three assignments below.
sg_filtered$treat = sg_filtered$benchmark
sg_filtered$treat[sg_filtered$benchmark] = "control"
sg_filtered$treat[sg_filtered$absolute] = "absolute"
sg_filtered$treat[sg_filtered$percentage] = "percentage"
# Factor with "control" as the reference level for later modelling.
sg_filtered$treat = as.factor(sg_filtered$treat)
sg_filtered$treat = relevel(sg_filtered$treat,
ref = "control")
# goal_length
# Goal duration in years: difftime() returns a span in days by default and
# lubridate::time_length() converts that span to years.
sg_filtered$goal_length <-
lubridate::time_length(
difftime(as.Date(sg_filtered$end_date),
as.Date(sg_filtered$starting_date)),
"years")
# pick the end point as on the day after the experiment ended
# age
# Age in whole years as of 2021-10-05 (eeptools::age_calc, floored).
sg_filtered$age <- floor(age_calc(as.Date(sg_filtered$date_of_birth),
as.Date('2021-10-05'),
units = "years"))
# tenure
# Account tenure in years between joining and the day after the experiment.
sg_filtered$tenure <- lubridate::time_length(difftime(as.Date('2021-10-05'),
as.Date(sg_filtered$joined_at)),
"years")
# Now that we have all the variables in hand, the next step is to pick
# important (pre-treatment) covariates for the analysis:
# drop "goal_on_target" and "current_amount" as they are measured at the
# end of the experiment.
# (These two lines were bare prose in the script, which is a syntax error
# in R; they are now comments.)
# goal-related covariates: goal_length, week_target, starting_capital,
#                          priority, category, saving_mode
# user-related covariates: gender, country, age, tenure
# Convert categorical variables into factors; exclude = NULL keeps NA as a
# factor level so it can be relabelled below.
cat.var <- c('priority',
             'category',
             'saving_mode',
             'gender',
             'country')
cont.var <- c('goal_length',
              'week_target',
              'starting_capital',
              'age',
              'tenure')
sg_filtered[, cat.var] <- lapply(sg_filtered[, cat.var], factor, exclude = NULL)
# The category variable contains NA values; treat them as an explicit
# "UNKNOWN" category instead of missing data.
levels(sg_filtered$category)[match(NA, levels(sg_filtered$category))] <- "UNKNOWN"
# Final analysis data frame (9,583 rows). all_of() makes the use of the
# external character vectors explicit for tidyselect.
# NOTE(review): saving_response is not created anywhere in this script --
# presumably it already exists on sg_filtered; verify upstream.
final_df <- sg_filtered %>% dplyr::select('viewed',
                                          'saving_response',
                                          'treat',
                                          'message_id',
                                          all_of(cat.var),
                                          all_of(cont.var)) #9583
# One-hot encode the categorical variables.
final_df <- dummy_cols(final_df, select_columns = cat.var)
# Drop the original factor columns by name (previously the brittle
# hard-coded index -c(5:9)).
final_df <- final_df[, !(names(final_df) %in% cat.var)]
# Treat a missing saving_response as "no saving response".
final_df$saving_response[is.na(final_df$saving_response)] <- 0
# Additional df for saving_response conditional on viewing the message.
final_df_cond <- final_df[final_df$viewed == 1, ]
# additional df for saving_response (conditional on viewing): final_df_cond
final_df_cond <- final_df[final_df$viewed == 1,]
# control vs absolute
ca <- subset(final_df, treat %in% c('control','absolute')) #5939
ca$treat <- ifelse(ca$treat == "control", 0, 1)
# control vs percentage
cp <- subset(final_df, treat %in% c('control','percentage')) #9268
cp$treat <- ifelse(cp$treat == "control", 0, 1)
# absolute vs percentage
ap <- subset(final_df, treat %in% c('absolute','percentage')) #3959
ap$treat <- ifelse(ap$treat == "absolute", 0, 1)
# control vs absolute
ca_cond <- subset(final_df_cond, treat %in% c('control','absolute')) #1736
ca_cond$treat <- ifelse(ca_cond$treat == "control", 0, 1)
# control vs percentage
cp_cond <- subset(final_df_cond, treat %in% c('control','percentage')) #2471
cp_cond$treat <- ifelse(cp_cond$treat == "control", 0, 1)
# absolute vs percentage
ap_cond <- subset(final_df_cond, treat %in% c('absolute','percentage')) #971
ap_cond$treat <- ifelse(ap_cond$treat == "absolute", 0, 1)
# Treatment indicator and covariate matrix (columns 5:31 are covariates).
W.ca <- ca$treat
X.ca <- ca[, c(5:31)]
# 1:1 nearest-neighbour propensity-score matching (logistic distance).
# `treat` (a column of `ca`) is used as the formula response so the formula
# resolves entirely within data = ca; the original used the global W.ca,
# which matchit had to look up in the calling environment (fragile).
m.ca <- matchit(
  reformulate(colnames(X.ca), response = "treat"),
  data = ca,
  method = "nearest",
  distance = "glm")
# Balance diagnostics: Love plot and balance table.
plot(summary(m.ca))
bal.tab(m.ca)
## Balance Measures
## Type Diff.Adj
## distance Distance 0.2078
## goal_length Contin. -0.0063
## week_target Contin. 0.0531
## starting_capital Contin. 0.0142
## age Contin. -0.1279
## tenure Contin. 0.0953
## priority_HIGH Binary -0.0571
## category_EDUCATION Binary 0.0000
## category_FINANCIAL_FREEDOM Binary -0.0095
## category_HEALTH Binary 0.0000
## category_HOME_ENTERTAINMENT Binary 0.0444
## category_HOME_LIVING Binary -0.0254
## category_HOUSEHOLD_APPLIANCES Binary 0.0000
## category_LEISURE_ENTERTAINMENT Binary 0.0000
## category_LIFE_EVENTS_PRESENTS Binary 0.0000
## category_LIFESTYLE Binary 0.0000
## category_MOBILITY Binary 0.0127
## category_RECURRING_COSTS Binary -0.0349
## category_SPORTS_EQUIPMENT Binary 0.0000
## category_TRAVEL_HOLIDAY Binary -0.0317
## category_UNKNOWN Binary 0.0444
## saving_mode_AUTOPILOT Binary -0.0571
## gender_FEMALE Binary -0.0286
## country_AT Binary 0.0413
##
## Sample sizes
## Control Treated
## All 5624 315
## Matched 315 315
## Unmatched 5309 0
# Treatment indicator and covariate matrix for control vs. percentage.
W.cp <- cp$treat
X.cp <- cp[, c(5:31)]
# Full matching on a logistic-regression propensity score; `treat` as the
# formula response keeps the formula self-contained within data = cp
# (the original referenced the global W.cp from the environment).
m.cp <- matchit(
  reformulate(colnames(X.cp), response = "treat"),
  data = cp,
  method = "full",
  distance = "glm")
# Balance diagnostics: Love plot and balance table.
plot(summary(m.cp))
bal.tab(m.cp)
## Balance Measures
## Type Diff.Adj
## distance Distance 0.0020
## goal_length Contin. 0.0073
## week_target Contin. -0.0780
## starting_capital Contin. -0.0763
## age Contin. 0.0012
## tenure Contin. 0.0061
## priority_HIGH Binary -0.1075
## category_EDUCATION Binary -0.0072
## category_FINANCIAL_FREEDOM Binary 0.0202
## category_HEALTH Binary -0.0003
## category_HOME_ENTERTAINMENT Binary -0.0130
## category_HOME_LIVING Binary -0.0682
## category_HOUSEHOLD_APPLIANCES Binary 0.0005
## category_LEISURE_ENTERTAINMENT Binary 0.0053
## category_LIFE_EVENTS_PRESENTS Binary 0.0062
## category_LIFESTYLE Binary -0.0030
## category_MOBILITY Binary -0.0203
## category_RECURRING_COSTS Binary 0.0001
## category_SPORTS_EQUIPMENT Binary 0.0092
## category_TRAVEL_HOLIDAY Binary 0.0642
## category_UNKNOWN Binary 0.0064
## saving_mode_AUTOPILOT Binary -0.0582
## gender_FEMALE Binary -0.0231
## country_AT Binary 0.0368
##
## Sample sizes
## Control Treated
## All 5624. 3644
## Matched (ESS) 186.53 3644
## Matched (Unweighted) 5624. 3644
# Treatment indicator and covariate matrix for absolute vs. percentage.
W.ap <- ap$treat
X.ap <- ap[, c(5:31)]
# Full matching on a logistic-regression propensity score; `treat` as the
# formula response keeps the formula self-contained within data = ap
# (the original referenced the global W.ap from the environment).
m.ap <- matchit(
  reformulate(colnames(X.ap), response = "treat"),
  data = ap,
  method = "full",
  distance = "glm")
# Balance diagnostics: Love plot and balance table.
plot(summary(m.ap))
bal.tab(m.ap)
## Balance Measures
## Type Diff.Adj
## distance Distance 0.0009
## goal_length Contin. -0.1134
## week_target Contin. 0.2164
## starting_capital Contin. -0.0218
## age Contin. -0.0496
## tenure Contin. 0.4342
## priority_HIGH Binary 0.0172
## category_EDUCATION Binary 0.0296
## category_FINANCIAL_FREEDOM Binary -0.0243
## category_HEALTH Contin. 0.0000
## category_HOME_ENTERTAINMENT Binary 0.0147
## category_HOME_LIVING Binary 0.0081
## category_HOUSEHOLD_APPLIANCES Binary 0.0063
## category_LEISURE_ENTERTAINMENT Binary 0.0074
## category_LIFE_EVENTS_PRESENTS Binary -0.0307
## category_LIFESTYLE Binary -0.0052
## category_MOBILITY Binary 0.0056
## category_RECURRING_COSTS Binary 0.0019
## category_SPORTS_EQUIPMENT Binary -0.0071
## category_TRAVEL_HOLIDAY Binary -0.0140
## category_UNKNOWN Binary 0.0077
## saving_mode_AUTOPILOT Binary -0.1313
## gender_FEMALE Binary -0.0245
## country_AT Binary 0.0182
##
## Sample sizes
## Control Treated
## All 315. 3644
## Matched (ESS) 44.43 3644
## Matched (Unweighted) 315. 3644
# Conditional-on-viewing sample: control vs. absolute.
W.ca.cond <- ca_cond$treat
X.ca.cond <- ca_cond[, c(5:31)]
# Full matching on a logistic-regression propensity score; `treat` as the
# formula response keeps the formula self-contained within data = ca_cond
# (the original referenced the global W.ca.cond from the environment).
m.ca.cond <- matchit(
  reformulate(colnames(X.ca.cond), response = "treat"),
  data = ca_cond,
  method = "full",
  distance = "glm")
# Balance diagnostics: Love plot and balance table.
plot(summary(m.ca.cond))
bal.tab(m.ca.cond)
## Balance Measures
## Type Diff.Adj
## distance Distance 0.0105
## goal_length Contin. -0.0283
## week_target Contin. 0.0757
## starting_capital Contin. -0.0160
## age Contin. 0.1861
## tenure Contin. -0.2235
## priority_HIGH Binary -0.0763
## category_EDUCATION Binary -0.0009
## category_FINANCIAL_FREEDOM Binary 0.0039
## category_HEALTH Contin. 0.0000
## category_HOME_ENTERTAINMENT Binary 0.0059
## category_HOME_LIVING Binary -0.0001
## category_HOUSEHOLD_APPLIANCES Binary -0.0008
## category_LEISURE_ENTERTAINMENT Binary 0.0022
## category_LIFE_EVENTS_PRESENTS Binary 0.0039
## category_LIFESTYLE Binary 0.0010
## category_MOBILITY Binary -0.0512
## category_RECURRING_COSTS Binary 0.0023
## category_SPORTS_EQUIPMENT Binary -0.0022
## category_TRAVEL_HOLIDAY Binary -0.0047
## category_UNKNOWN Binary 0.0408
## saving_mode_AUTOPILOT Binary 0.0127
## gender_FEMALE Binary 0.0239
## country_AT Binary 0.0111
##
## Sample sizes
## Control Treated
## All 1618. 118
## Matched (ESS) 116.73 118
## Matched (Unweighted) 1618. 118
# Conditional-on-viewing sample: control vs. percentage.
W.cp.cond <- cp_cond$treat
X.cp.cond <- cp_cond[, c(5:31)]
# Full matching on a logistic-regression propensity score; `treat` as the
# formula response keeps the formula self-contained within data = cp_cond
# (the original referenced the global W.cp.cond from the environment).
m.cp.cond <- matchit(
  reformulate(colnames(X.cp.cond), response = "treat"),
  data = cp_cond,
  method = "full",
  distance = "glm")
# Balance diagnostics: Love plot and balance table.
plot(summary(m.cp.cond))
bal.tab(m.cp.cond)
## Balance Measures
## Type Diff.Adj
## distance Distance -0.0006
## goal_length Contin. 0.0472
## week_target Contin. 0.0974
## starting_capital Contin. 0.0719
## age Contin. -0.0676
## tenure Contin. 0.0074
## priority_HIGH Binary -0.0140
## category_EDUCATION Binary 0.0089
## category_FINANCIAL_FREEDOM Binary -0.0019
## category_HEALTH Contin. 0.0000
## category_HOME_ENTERTAINMENT Binary -0.0005
## category_HOME_LIVING Binary -0.0133
## category_HOUSEHOLD_APPLIANCES Binary -0.0109
## category_LEISURE_ENTERTAINMENT Binary 0.0069
## category_LIFE_EVENTS_PRESENTS Binary -0.0011
## category_LIFESTYLE Binary 0.0052
## category_MOBILITY Binary -0.0171
## category_RECURRING_COSTS Binary -0.0014
## category_SPORTS_EQUIPMENT Binary 0.0036
## category_TRAVEL_HOLIDAY Binary -0.0518
## category_UNKNOWN Binary 0.0735
## saving_mode_AUTOPILOT Binary -0.0482
## gender_FEMALE Binary -0.0323
## country_AT Binary -0.0133
##
## Sample sizes
## Control Treated
## All 1618. 853
## Matched (ESS) 143.47 853
## Matched (Unweighted) 1618. 853
# Conditional-on-viewing sample: absolute vs. percentage.
W.ap.cond <- ap_cond$treat
X.ap.cond <- ap_cond[, c(5:31)]
# Full matching on a logistic-regression propensity score; `treat` as the
# formula response keeps the formula self-contained within data = ap_cond
# (the original referenced the global W.ap.cond from the environment).
m.ap.cond <- matchit(
  reformulate(colnames(X.ap.cond), response = "treat"),
  data = ap_cond,
  method = "full",
  distance = "glm")
# Balance diagnostics: Love plot and balance table.
plot(summary(m.ap.cond))
bal.tab(m.ap.cond)
## Balance Measures
## Type Diff.Adj
## distance Distance 0.0476
## goal_length Contin. -0.0533
## week_target Contin. 0.1337
## starting_capital Contin. 0.0790
## age Contin. 0.1324
## tenure Contin. 0.1167
## priority_HIGH Binary -0.0788
## category_EDUCATION Binary 0.0176
## category_FINANCIAL_FREEDOM Binary -0.0539
## category_HEALTH Contin. 0.0000
## category_HOME_ENTERTAINMENT Binary 0.0434
## category_HOME_LIVING Binary -0.0234
## category_HOUSEHOLD_APPLIANCES Binary 0.0223
## category_LEISURE_ENTERTAINMENT Binary 0.0094
## category_LIFE_EVENTS_PRESENTS Binary -0.0152
## category_LIFESTYLE Binary 0.0094
## category_MOBILITY Binary 0.0290
## category_RECURRING_COSTS Binary 0.0012
## category_SPORTS_EQUIPMENT Binary 0.0305
## category_TRAVEL_HOLIDAY Binary -0.1028
## category_UNKNOWN Binary 0.0328
## saving_mode_AUTOPILOT Binary 0.0853
## gender_FEMALE Binary -0.0625
## country_AT Binary 0.0695
##
## Sample sizes
## Control Treated
## All 118. 853
## Matched (ESS) 22.04 853
## Matched (Unweighted) 118. 853
# Extract matched samples (with matching weights/subclasses) for each pair.
# Nearest-neighbour (m.ca) discards unmatched controls; full matching keeps
# all units and balances via the weights column.
ca_matched <- match.data(m.ca)
cp_matched <- match.data(m.cp)
ap_matched <- match.data(m.ap)
ca_cond_matched <- match.data(m.ca.cond)
cp_cond_matched <- match.data(m.cp.cond)
ap_cond_matched <- match.data(m.ap.cond)
# Group sizes after matching.
table(ca_matched$treat)
##
## 0 1
## 315 315
table(cp_matched$treat)
##
## 0 1
## 5624 3644
table(ap_matched$treat)
##
## 0 1
## 315 3644
table(ca_cond_matched$treat)
##
## 0 1
## 1618 118
table(cp_cond_matched$treat)
##
## 0 1
## 1618 853
table(ap_cond_matched$treat)
##
## 0 1
## 118 853
# set seed & trees
myseed <- 1234
ntrees <- 500
# Seed the RNG so the test-train split is reproducible (myseed was defined
# in the original but never applied before initial_split).
set.seed(myseed)
# test-train split (50/50)
ca_split <- initial_split(ca_matched, prop = 0.5)
ca_train <- training(ca_split) #315
ca_test <- testing(ca_split) #315
X.ca.train <- ca_train[, c(5:31)]
X.ca.test <- ca_test[, c(5:31)]
Y.ca.train.v <- ca_train$viewed
Y.ca.train.s <- ca_train$saving_response
W.ca.train <- ca_train$treat
# m(X) = E[Y | X]: out-of-bag nuisance estimates on the TRAINING rows.
# BUG FIX: the original predicted on X.ca.test, misaligning Y.hat/W.hat
# with the training observations passed to causal_forest (it only ran at
# all because the 50/50 split made the lengths match).
Y.ca.forest.v <- regression_forest(X.ca.train, Y.ca.train.v, num.trees = ntrees)
Y.ca.hat.v <- predict(Y.ca.forest.v)$predictions
Y.ca.forest.s <- regression_forest(X.ca.train, Y.ca.train.s, num.trees = ntrees)
Y.ca.hat.s <- predict(Y.ca.forest.s)$predictions
# propensity score e(X) = E[W | X]: OOB estimates on the training rows.
W.ca.forest <- regression_forest(X.ca.train, W.ca.train, num.trees = ntrees)
W.ca.hat <- predict(W.ca.forest)$predictions
# cf estimation
# Causal forests with externally supplied nuisance estimates (Y.hat, W.hat)
# and full hyperparameter tuning.
cf_ca_v <- causal_forest(
X.ca.train,
Y.ca.train.v,
W.ca.train,
Y.hat = Y.ca.hat.v,
W.hat = W.ca.hat,
tune.parameters = "all",
seed = myseed
)
cf_ca_s <- causal_forest(
X.ca.train,
Y.ca.train.s,
W.ca.train,
Y.hat = Y.ca.hat.s,
W.hat = W.ca.hat,
tune.parameters = "all",
seed = myseed
)
# CATE predictions on the held-out test set (note: these are test-set
# predictions, not out-of-bag predictions as the original comment said).
tau_hat_ca_v = predict(cf_ca_v, X.ca.test)$predictions
tau_hat_ca_s = predict(cf_ca_s, X.ca.test)$predictions
# plot tau.hat
# Density of predicted CATEs on the held-out test set.
plot(density(tau_hat_ca_v),
main = "Predicted view rate CATEs: control vs. absolute")
plot(density(tau_hat_ca_s),
main = "Predicted saving rate CATEs: control vs. absolute")
# ATE
# Doubly robust (AIPW) ATE from the causal forest, with a 95% normal CI.
ca_ATE_ps_v = average_treatment_effect(cf_ca_v)
paste("95% CI for the view rate ATE:", round(ca_ATE_ps_v[1], 3),
"+/-", round(qnorm(0.975) * ca_ATE_ps_v[2], 3))
## [1] "95% CI for the view rate ATE: 0.054 +/- 0.131"
ca_ATE_ps_s = average_treatment_effect(cf_ca_s)
paste("95% CI for the saving rate ATE:", round(ca_ATE_ps_s[1], 3),
"+/-", round(qnorm(0.975) * ca_ATE_ps_s[2], 3))
## [1] "95% CI for the saving rate ATE: -0.06 +/- 0.085"
ca_ATE_ps_v
## estimate std.err
## 0.05418335 0.06703059
ca_ATE_ps_s
## estimate std.err
## -0.06033495 0.04332149
# For the conditional data frame (saving response conditional on viewing):
# (this line was bare prose -- a syntax error in R -- now a comment)
# test-train split (50/50); reseed for reproducibility
set.seed(myseed)
ca_cond_split <- initial_split(ca_cond_matched, prop = 0.5)
ca_cond_train <- training(ca_cond_split) #868
ca_cond_test <- testing(ca_cond_split) #868
X.ca.cond.train <- ca_cond_train[, c(5:31)]
X.ca.cond.test <- ca_cond_test[, c(5:31)]
Y.ca.cond.train.s <- ca_cond_train$saving_response
W.ca.cond.train <- ca_cond_train$treat
# m(X) = E[Y | X]: out-of-bag nuisance estimates on the TRAINING rows.
# BUG FIX: the original predicted on the test rows, misaligning Y.hat/W.hat
# with the training observations passed to causal_forest.
Y.ca.cond.forest.s <- regression_forest(X.ca.cond.train, Y.ca.cond.train.s, num.trees = ntrees)
Y.ca.cond.hat.s <- predict(Y.ca.cond.forest.s)$predictions
# propensity score e(X) = E[W | X]: OOB estimates on the training rows.
W.ca.cond.forest <- regression_forest(X.ca.cond.train, W.ca.cond.train, num.trees = ntrees)
W.ca.cond.hat <- predict(W.ca.cond.forest)$predictions
# cf estimation
# Causal forest with externally supplied nuisance estimates.
cf_ca_cond_s <- causal_forest(
X.ca.cond.train,
Y.ca.cond.train.s,
W.ca.cond.train,
Y.hat = Y.ca.cond.hat.s,
W.hat = W.ca.cond.hat,
tune.parameters = "all",
seed = myseed
)
# CATE predictions on the held-out test set (not out-of-bag, despite the
# original comment).
tau_hat_ca_cond_s = predict(cf_ca_cond_s, X.ca.cond.test)$predictions
# plot tau.hat
plot(density(tau_hat_ca_cond_s),
main = "Predicted conditional saving rate CATEs: control vs. absolute")
# save rate ATE conditional on viewing
# Doubly robust (AIPW) ATE with a 95% normal CI.
ca_cond_ATE_ps_s = average_treatment_effect(cf_ca_cond_s)
paste("95% CI for the conditional saving rate ATE:", round(ca_cond_ATE_ps_s[1], 3),
"+/-", round(qnorm(0.975) * ca_cond_ATE_ps_s[2], 3))
## [1] "95% CI for the conditional saving rate ATE: 0.11 +/- 0.154"
ca_cond_ATE_ps_s
## estimate std.err
## 0.10958988 0.07849574
# test-train split (50/50); reseed for reproducibility
set.seed(myseed)
cp_split <- initial_split(cp_matched, prop = 0.5)
cp_train <- training(cp_split) #4634
cp_test <- testing(cp_split) #4634
X.cp.train <- cp_train[, c(5:31)]
X.cp.test <- cp_test[, c(5:31)]
Y.cp.train.v <- cp_train$viewed
Y.cp.train.s <- cp_train$saving_response
W.cp.train <- cp_train$treat
# m(X) = E[Y | X]: out-of-bag nuisance estimates on the TRAINING rows.
# BUG FIX: the original predicted on X.cp.test, misaligning Y.hat/W.hat
# with the training observations passed to causal_forest.
Y.cp.forest.v <- regression_forest(X.cp.train, Y.cp.train.v, num.trees = ntrees)
Y.cp.hat.v <- predict(Y.cp.forest.v)$predictions
Y.cp.forest.s <- regression_forest(X.cp.train, Y.cp.train.s, num.trees = ntrees)
Y.cp.hat.s <- predict(Y.cp.forest.s)$predictions
# propensity score e(X) = E[W | X]: OOB estimates on the training rows.
W.cp.forest <- regression_forest(X.cp.train, W.cp.train, num.trees = ntrees)
W.cp.hat <- predict(W.cp.forest)$predictions
# cf estimation
# Causal forests with externally supplied nuisance estimates.
cf_cp_v <- causal_forest(
X.cp.train,
Y.cp.train.v,
W.cp.train,
Y.hat = Y.cp.hat.v,
W.hat = W.cp.hat,
tune.parameters = "all",
seed = myseed
)
cf_cp_s <- causal_forest(
X.cp.train,
Y.cp.train.s,
W.cp.train,
Y.hat = Y.cp.hat.s,
W.hat = W.cp.hat,
tune.parameters = "all",
seed = myseed
)
# CATE predictions on the held-out test set (not out-of-bag, despite the
# original comment).
tau_hat_cp_v = predict(cf_cp_v, X.cp.test)$predictions
tau_hat_cp_s = predict(cf_cp_s, X.cp.test)$predictions
# plot tau.hat
plot(density(tau_hat_cp_v),
main = "Predicted view rate CATEs: control vs. percentage")
plot(density(tau_hat_cp_s),
main = "Predicted saving rate CATEs: control vs. percentage")
# ATE
# Doubly robust (AIPW) ATE with a 95% normal CI.
cp_ATE_ps_v = average_treatment_effect(cf_cp_v)
paste("95% CI for the view rate ATE:", round(cp_ATE_ps_v[1], 3),
"+/-", round(qnorm(0.975) * cp_ATE_ps_v[2], 3))
## [1] "95% CI for the view rate ATE: -0.316 +/- 0.129"
cp_ATE_ps_s = average_treatment_effect(cf_cp_s)
paste("95% CI for the saving rate ATE:", round(cp_ATE_ps_s[1], 3),
"+/-", round(qnorm(0.975) * cp_ATE_ps_s[2], 3))
## [1] "95% CI for the saving rate ATE: -0.037 +/- 0.055"
cp_ATE_ps_v
## estimate std.err
## -0.31636565 0.06596747
cp_ATE_ps_s
## estimate std.err
## -0.03749626 0.02823887
# For the conditional data frame (saving response conditional on viewing):
# (this line was bare prose -- a syntax error in R -- now a comment)
# test-train split
cp_cond_split <- initial_split(cp_cond_matched, prop = 0.5)
cp_cond_train <- training(cp_cond_split) #1235
cp_cond_test <- testing(cp_cond_split) #1236
X.cp.cond.train = cp_cond_train[, c(5:31)]
X.cp.cond.test = cp_cond_test[, c(5:31)]
Y.cp.cond.train.s = cp_cond_train$saving_response
W.cp.cond.train = cp_cond_train$treat
# cf estimation (no Y.hat/W.hat supplied here, so grf estimates the
# nuisance components internally -- unlike the earlier blocks)
cf_cp_cond_s <- causal_forest(
X.cp.cond.train,
Y.cp.cond.train.s,
W.cp.cond.train,
tune.parameters = "all",
seed = myseed
)
# CATE predictions on the held-out test set (not out-of-bag, despite the
# original comment).
tau_hat_cp_cond_s = predict(cf_cp_cond_s, X.cp.cond.test)$predictions
# plot tau.hat
plot(density(tau_hat_cp_cond_s),
main = "Predicted conditional saving rate CATEs: control vs. percentage")
# save rate ATE conditional on viewing
# Doubly robust (AIPW) ATE with a 95% normal CI.
cp_cond_ATE_ps_s = average_treatment_effect(cf_cp_cond_s)
paste("95% CI for the saving rate ATE:", round(cp_cond_ATE_ps_s[1], 3),
"+/-", round(qnorm(0.975) * cp_cond_ATE_ps_s[2], 3))
## [1] "95% CI for the saving rate ATE: -0.098 +/- 0.042"
cp_cond_ATE_ps_s
## estimate std.err
## -0.09837554 0.02165855
# test-train split
ap_split <- initial_split(ap_matched, prop = 0.5)
ap_train <- training(ap_split) #1979
ap_test <- testing(ap_split) #1980
X.ap.train = ap_train[, c(5:31)]
X.ap.test = ap_test[, c(5:31)]
Y.ap.train.v = ap_train$viewed
Y.ap.train.s = ap_train$saving_response
W.ap.train = ap_train$treat
# cf estimation (nuisance components estimated internally by grf here)
cf_ap_v <- causal_forest(
X.ap.train,
Y.ap.train.v,
W.ap.train,
tune.parameters = "all",
seed = myseed
)
cf_ap_s <- causal_forest(
X.ap.train,
Y.ap.train.s,
W.ap.train,
tune.parameters = "all",
seed = myseed
)
# CATE predictions on the held-out test set (not out-of-bag, despite the
# original comment).
tau_hat_ap_v = predict(cf_ap_v, X.ap.test)$predictions
tau_hat_ap_s = predict(cf_ap_s, X.ap.test)$predictions
# plot tau.hat
plot(density(tau_hat_ap_v),
main = "Predicted view rate CATEs: absolute vs. percentage")
plot(density(tau_hat_ap_s),
main = "Predicted saving rate CATEs: absolute vs. percentage")
# ATE
# NOTE(review): the NaN estimates below are consistent with poor overlap in
# this comparison (full matching reported ESS ~44 controls vs 3,644 treated).
# Consider average_treatment_effect(..., target.sample = "overlap") --
# verify against the grf documentation.
ap_ATE_ps_v = average_treatment_effect(cf_ap_v)
paste("95% CI for the view rate ATE:", round(ap_ATE_ps_v[1], 3),
"+/-", round(qnorm(0.975) * ap_ATE_ps_v[2], 3))
## [1] "95% CI for the view rate ATE: NaN +/- NaN"
ap_ATE_ps_s = average_treatment_effect(cf_ap_s)
paste("95% CI for the saving rate ATE:", round(ap_ATE_ps_s[1], 3),
"+/-", round(qnorm(0.975) * ap_ATE_ps_s[2], 3))
## [1] "95% CI for the saving rate ATE: NaN +/- NaN"
ap_ATE_ps_v
## estimate std.err
## NaN NaN
ap_ATE_ps_s
## estimate std.err
## NaN NaN
# For the conditional data frame (saving response conditional on viewing):
# (this line was bare prose -- a syntax error in R -- now a comment)
# test-train split
ap_cond_split <- initial_split(ap_cond_matched, prop = 0.5)
ap_cond_train <- training(ap_cond_split) #485
ap_cond_test <- testing(ap_cond_split) #486
X.ap.cond.train = ap_cond_train[, c(5:31)]
X.ap.cond.test = ap_cond_test[, c(5:31)]
Y.ap.cond.train.s = ap_cond_train$saving_response
W.ap.cond.train = ap_cond_train$treat
# cf estimation (nuisance components estimated internally by grf here)
cf_ap_cond_s <- causal_forest(
X.ap.cond.train,
Y.ap.cond.train.s,
W.ap.cond.train,
tune.parameters = "all",
seed = myseed
)
# CATE predictions on the held-out test set (not out-of-bag, despite the
# original comment).
tau_hat_ap_cond_s = predict(cf_ap_cond_s, X.ap.cond.test)$predictions
# plot tau.hat
plot(density(tau_hat_ap_cond_s),
main = "Predicted conditional saving rate CATEs: absolute vs. percentage")
# save rate ATE conditional on viewing
# Doubly robust (AIPW) ATE with a 95% normal CI.
ap_cond_ATE_ps_s = average_treatment_effect(cf_ap_cond_s)
paste("95% CI for the conditional saving rate ATE:", round(ap_cond_ATE_ps_s[1], 3),
"+/-", round(qnorm(0.975) * ap_cond_ATE_ps_s[2], 3))
## [1] "95% CI for the conditional saving rate ATE: -0.136 +/- 0.112"
ap_cond_ATE_ps_s
## estimate std.err
## -0.13601824 0.05738161
# BLP test for heterogeneity
# Omnibus calibration test (grf): a mean.forest.prediction coefficient near
# 1 indicates the average prediction is well calibrated; a significantly
# positive differential.forest.prediction coefficient is evidence of
# detectable treatment-effect heterogeneity.
test_calibration(cf_cp_v, vcov.type = "HC3")
##
## Best linear fit using forest predictions (on held-out data)
## as well as the mean forest prediction as regressors, along
## with one-sided heteroskedasticity-robust (HC3) SEs:
##
## Estimate Std. Error t value
## mean.forest.prediction 1.202439 0.140213 8.5758
## differential.forest.prediction 1.339288 0.076061 17.6081
## Pr(>t)
## mean.forest.prediction < 0.00000000000000022 ***
## differential.forest.prediction < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_calibration(cf_cp_s, vcov.type = "HC3")
##
## Best linear fit using forest predictions (on held-out data)
## as well as the mean forest prediction as regressors, along
## with one-sided heteroskedasticity-robust (HC3) SEs:
##
## Estimate Std. Error t value
## mean.forest.prediction 1.00424 0.10948 9.1730
## differential.forest.prediction 2.02009 0.26378 7.6582
## Pr(>t)
## mean.forest.prediction < 0.00000000000000022 ***
## differential.forest.prediction 0.00000000000001141 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
test_calibration(cf_cp_cond_s, vcov.type = "HC3")
##
## Best linear fit using forest predictions (on held-out data)
## as well as the mean forest prediction as regressors, along
## with one-sided heteroskedasticity-robust (HC3) SEs:
##
## Estimate Std. Error t value Pr(>t)
## mean.forest.prediction 1.05108 0.31592 3.3271 0.0004518 ***
## differential.forest.prediction 1.33736 0.31251 4.2794 0.0000101 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Variable importance per forest; transposing to a named 1 x p matrix lets
# barplot() label each bar with its covariate name.
# viewed
vi.v <- t(variable_importance(cf_cp_v))
colnames(vi.v) <- names(X.cp.train)
barplot(vi.v, horiz = TRUE, las = 2, cex.axis = 0.5, cex.names = 0.5,
        main = "Variable Importance for View Rate")
# saving_response
vi.s <- t(variable_importance(cf_cp_s))
colnames(vi.s) <- names(X.cp.train)
barplot(vi.s, horiz = TRUE, las = 2, cex.axis = 0.5, cex.names = 0.5,
        main = "Variable Importance for Saving Rate")
# conditional saving_response
vi.cond.s <- t(variable_importance(cf_cp_cond_s))
colnames(vi.cond.s) <- names(X.cp.cond.train)
barplot(vi.cond.s, horiz = TRUE, las = 2, cex.axis = 0.5, cex.names = 0.5,
        main = "Variable Importance for Conditional Saving Rate")
# pick important vars (vi > median)
# Keep only covariates whose importance exceeds the median importance of
# their own forest.
impvar.v <- colnames(vi.v)[vi.v > median(vi.v)]
print(impvar.v)
## [1] "goal_length" "week_target"
## [3] "starting_capital" "age"
## [5] "tenure" "category_FINANCIAL_FREEDOM"
## [7] "category_HOME_LIVING" "category_LIFE_EVENTS_PRESENTS"
## [9] "category_SPORTS_EQUIPMENT" "category_TRAVEL_HOLIDAY"
## [11] "category_UNKNOWN" "saving_mode_AUTOPILOT"
## [13] "saving_mode_MANUAL"
impvar.s <- colnames(vi.s)[vi.s > median(vi.s)]
print(impvar.s)
## [1] "goal_length" "week_target"
## [3] "starting_capital" "age"
## [5] "tenure" "category_FINANCIAL_FREEDOM"
## [7] "category_LEISURE_ENTERTAINMENT" "category_MOBILITY"
## [9] "category_SPORTS_EQUIPMENT" "saving_mode_AUTOPILOT"
## [11] "saving_mode_MANUAL" "gender_FEMALE"
## [13] "gender_MALE"
# BUG FIX: the original filtered with `vi.s` (the unconditional saving-rate
# importances) against `median(vi.cond.s)`, mixing two different forests'
# importance vectors. The conditional forest's own importances must be used
# on both sides of the comparison.
impvar.cond.s <- colnames(vi.cond.s)[vi.cond.s > median(vi.cond.s)]
print(impvar.cond.s)
## NOTE(review): output below was produced by the original (buggy) selection;
## re-run to refresh — the selected set may change slightly.
## [1] "goal_length" "week_target"
## [3] "starting_capital" "age"
## [5] "tenure" "category_FINANCIAL_FREEDOM"
## [7] "category_HOME_ENTERTAINMENT" "category_LEISURE_ENTERTAINMENT"
## [9] "category_MOBILITY" "category_SPORTS_EQUIPMENT"
## [11] "category_TRAVEL_HOLIDAY" "saving_mode_AUTOPILOT"
## [13] "saving_mode_MANUAL" "gender_FEMALE"
## [15] "gender_MALE" "country_AT"
# evaluating heterogeneity with important var
## viewed
# Project the estimated CATEs onto the high-importance covariates to see
# which drive heterogeneity in the view-rate effect (HC3-robust SEs).
blp_cp_v <- best_linear_projection(cf_cp_v,
                                   cp_train[, impvar.v],
                                   vcov.type = "HC3")
print(blp_cp_v)
##
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.3949048924 0.2945303330 -1.3408 0.18005
## goal_length 0.0154192143 0.0180787332 0.8529 0.39376
## week_target -0.0000149135 0.0000136408 -1.0933 0.27432
## starting_capital 0.0000023195 0.0000050749 0.4571 0.64765
## age 0.0049888455 0.0055451628 0.8997 0.36834
## tenure 0.0042192594 0.1201494592 0.0351 0.97199
## category_FINANCIAL_FREEDOM -0.6441629371 0.3428497382 -1.8788 0.06033 .
## category_HOME_LIVING 0.4363051241 0.2050044017 2.1283 0.03337 *
## category_LIFE_EVENTS_PRESENTS -0.3002309699 0.3597146834 -0.8346 0.40397
## category_SPORTS_EQUIPMENT 0.3273064313 0.4781860602 0.6845 0.49371
## category_TRAVEL_HOLIDAY -0.0960605632 0.1680326108 -0.5717 0.56757
## category_UNKNOWN 0.0292219999 0.1791152537 0.1631 0.87041
## saving_mode_AUTOPILOT -0.0940320474 0.1380621669 -0.6811 0.49585
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## saving_response
# Same projection for the saving-rate forest.
blp_cp_s <- best_linear_projection(cf_cp_s,
                                   cp_train[, impvar.s],
                                   vcov.type = "HC3")
print(blp_cp_s)
##
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1894794864 0.1368842825 1.3842 0.1664
## goal_length -0.0022500162 0.0036283735 -0.6201 0.5352
## week_target -0.0000193091 0.0000120154 -1.6070 0.1081
## starting_capital 0.0000011541 0.0000012699 0.9088 0.3635
## age -0.0041300447 0.0026393695 -1.5648 0.1177
## tenure -0.0057171328 0.0576847953 -0.0991 0.9211
## category_FINANCIAL_FREEDOM -0.1768670494 0.1621538602 -1.0907 0.2754
## category_LEISURE_ENTERTAINMENT 0.0891049173 0.3104154032 0.2871 0.7741
## category_MOBILITY -0.0888940653 0.1013703991 -0.8769 0.3806
## category_SPORTS_EQUIPMENT -0.1392654732 0.2334292115 -0.5966 0.5508
## saving_mode_AUTOPILOT -0.0129323100 0.0642830141 -0.2012 0.8406
## gender_FEMALE 0.0349714714 0.0658781654 0.5309 0.5955
## conditional saving_response
# Same projection for the saving-rate forest conditional on viewing.
blp_cp_cond_s <- best_linear_projection(cf_cp_cond_s,
                                        cp_cond_train[, impvar.cond.s],
                                        vcov.type = "HC3")
print(blp_cp_cond_s)
##
## Best linear projection of the conditional average treatment effect.
## Confidence intervals are cluster- and heteroskedasticity-robust (HC3):
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.15420761843 0.11427044619 1.3495 0.17743
## goal_length -0.00150482275 0.00349037642 -0.4311 0.66645
## week_target -0.00000652300 0.00000465649 -1.4008 0.16152
## starting_capital 0.00000011021 0.00000137807 0.0800 0.93627
## age -0.00566523773 0.00234571654 -2.4151 0.01588 *
## tenure 0.03593254710 0.08368361753 0.4294 0.66772
## category_FINANCIAL_FREEDOM -0.08731065694 0.09255146505 -0.9434 0.34568
## category_HOME_ENTERTAINMENT 0.13722874207 0.14424629566 0.9514 0.34162
## category_LEISURE_ENTERTAINMENT 0.34774970064 0.18580622215 1.8716 0.06151 .
## category_MOBILITY -0.04600418635 0.09162355395 -0.5021 0.61569
## category_SPORTS_EQUIPMENT 0.04338853191 0.07229981544 0.6001 0.54854
## category_TRAVEL_HOLIDAY -0.03840158337 0.05090036656 -0.7544 0.45073
## saving_mode_AUTOPILOT -0.00137652559 0.04624970190 -0.0298 0.97626
## gender_FEMALE -0.00026143372 0.04567575247 -0.0057 0.99543
## country_AT -0.08564785026 0.05424386784 -1.5789 0.11461
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# plot CATE
## view rate
# One panel per covariate: vary covariate k over its observed range (101 grid
# points) with every other covariate held at 0, then plot the predicted CATE
# with a +/- 1 SE band; the red horizontal line marks zero effect.
# NOTE(review): holding all other covariates at 0 is a strong baseline for the
# non-dummy columns — confirm this is the intended reference point.
par(mfrow = c(3, 4))
# Generalized: iterate over all covariates instead of the hard-coded 1:27.
for (k in seq_len(ncol(X.cp))) {
  X.cp.partial <- matrix(0, 101, ncol(X.cp))
  X.cp.partial[, k] <- seq(min(X.cp[, k]), max(X.cp[, k]), length.out = 101)
  tau_hat_cp_test_v <- predict(cf_cp_v, X.cp.partial, estimate.variance = TRUE)
  sigma_hat_v <- sqrt(tau_hat_cp_test_v$variance.estimates)
  plot(X.cp.partial[, k], tau_hat_cp_test_v$predictions,
       ylim = range(tau_hat_cp_test_v$predictions + sigma_hat_v,
                    tau_hat_cp_test_v$predictions - sigma_hat_v, 0, 0.01),
       xlab = colnames(X.cp)[k], ylab = "CATE", type = "l")
  lines(X.cp.partial[, k], tau_hat_cp_test_v$predictions + sigma_hat_v,
        col = 1, lty = 2)
  lines(X.cp.partial[, k], tau_hat_cp_test_v$predictions - sigma_hat_v,
        col = 1, lty = 2)
  abline(h = 0, col = 2)
}
## saving rate
# Same partial CATE curves for the saving-rate forest (see the view-rate
# version for details): one covariate varied at a time, others held at 0,
# +/- 1 SE band, red line at zero.
par(mfrow = c(3, 4))
# Generalized: iterate over all covariates instead of the hard-coded 1:27.
for (k in seq_len(ncol(X.cp))) {
  X.cp.partial <- matrix(0, 101, ncol(X.cp))
  X.cp.partial[, k] <- seq(min(X.cp[, k]), max(X.cp[, k]), length.out = 101)
  tau_hat_cp_test_s <- predict(cf_cp_s, X.cp.partial, estimate.variance = TRUE)
  sigma_hat_s <- sqrt(tau_hat_cp_test_s$variance.estimates)
  plot(X.cp.partial[, k], tau_hat_cp_test_s$predictions,
       ylim = range(tau_hat_cp_test_s$predictions + sigma_hat_s,
                    tau_hat_cp_test_s$predictions - sigma_hat_s, 0, 0.01),
       xlab = colnames(X.cp)[k], ylab = "CATE", type = "l")
  lines(X.cp.partial[, k], tau_hat_cp_test_s$predictions + sigma_hat_s,
        col = 1, lty = 2)
  lines(X.cp.partial[, k], tau_hat_cp_test_s$predictions - sigma_hat_s,
        col = 1, lty = 2)
  abline(h = 0, col = 2)
}
## conditional saving rate
# Same partial CATE curves for the conditional-on-viewing saving-rate forest:
# one covariate varied at a time, others held at 0, +/- 1 SE band, red line
# at zero.
par(mfrow = c(3, 4))
# Generalized: iterate over all covariates instead of the hard-coded 1:27.
for (k in seq_len(ncol(X.cp.cond))) {
  X.cp.cond.partial <- matrix(0, 101, ncol(X.cp.cond))
  X.cp.cond.partial[, k] <- seq(min(X.cp.cond[, k]), max(X.cp.cond[, k]),
                                length.out = 101)
  tau_hat_cp_cond_test_s <- predict(cf_cp_cond_s, X.cp.cond.partial,
                                    estimate.variance = TRUE)
  sigma_hat_cond_s <- sqrt(tau_hat_cp_cond_test_s$variance.estimates)
  plot(X.cp.cond.partial[, k], tau_hat_cp_cond_test_s$predictions,
       ylim = range(tau_hat_cp_cond_test_s$predictions + sigma_hat_cond_s,
                    tau_hat_cp_cond_test_s$predictions - sigma_hat_cond_s,
                    0, 0.01),
       xlab = colnames(X.cp.cond)[k], ylab = "CATE", type = "l")
  lines(X.cp.cond.partial[, k],
        tau_hat_cp_cond_test_s$predictions + sigma_hat_cond_s,
        col = 1, lty = 2)
  lines(X.cp.cond.partial[, k],
        tau_hat_cp_cond_test_s$predictions - sigma_hat_cond_s,
        col = 1, lty = 2)
  abline(h = 0, col = 2)
}
# view rate
# Offline policy evaluation: assign each test user the arm the forest predicts
# is better (CATE < 0 -> control, CATE >= 0 -> percentage treatment) and
# compare the realized view rate against random / status-quo / treat-everyone
# benchmarks.
Y.cp.test.v <- cp_test$viewed
W.cp.test <- cp_test$treat
seg_cf_s1_v <- tau_hat_cp_v < 0   # control
seg_cf_s2_v <- tau_hat_cp_v >= 0  # percentage
# Share of test users each arm would receive under the CF policy.
# (Renamed from optimal_allocation_v: the original reused that name for two
# different quantities; the first use was overwritten below anyway.)
policy_shares_v <- c(sum(seg_cf_s1_v), sum(seg_cf_s2_v)) / length(Y.cp.test.v)
print(policy_shares_v)
## [1] 0.7438498 0.2561502
random_allocation_v <- mean(Y.cp.test.v)
statusquo_allocation_v <- mean(Y.cp.test.v[W.cp.test == 0])
undifferentiated_allocation_v <- mean(Y.cp.test.v[W.cp.test == 1])
optimal_allocation_v <- mean(c(Y.cp.test.v[seg_cf_s1_v & W.cp.test == 0],
                               Y.cp.test.v[seg_cf_s2_v & W.cp.test == 1]))
# BUG FIX: digits argument was round(3) (harmless but accidental) -> 3.
offline_evaluation_v <- round(rbind(
  random_allocation_v,
  statusquo_allocation_v,
  undifferentiated_allocation_v,
  optimal_allocation_v), 3)
colnames(offline_evaluation_v) <- "View rate"
# BUG FIX: labels must follow the rbind() row order (random first, then
# status quo); the original vector swapped the first two labels, mislabeling
# the table.
rownames(offline_evaluation_v) <- c("Random",
                                    "Status Quo",
                                    "Undifferentiated",
                                    "Optimized with CF")
print(offline_evaluation_v)
## With corrected labels the table prints as:
##                   View rate
## Random                0.261
## Status Quo            0.276
## Undifferentiated      0.238
## Optimized with CF     0.359
# saving rate
# Offline policy evaluation for the saving-rate outcome (same scheme as the
# view-rate block: CATE < 0 -> control, CATE >= 0 -> percentage treatment).
Y.cp.test.s <- cp_test$saving_response
W.cp.test <- cp_test$treat
seg_cf_s1_s <- tau_hat_cp_s < 0   # control
seg_cf_s2_s <- tau_hat_cp_s >= 0  # percentage
# Share of test users each arm would receive under the CF policy.
# (Renamed from optimal_allocation_s: the original reused that name for two
# different quantities; the first use was overwritten below anyway.)
policy_shares_s <- c(sum(seg_cf_s1_s), sum(seg_cf_s2_s)) / length(Y.cp.test.s)
print(policy_shares_s)
## [1] 0.98683643 0.01316357
random_allocation_s <- mean(Y.cp.test.s)
statusquo_allocation_s <- mean(Y.cp.test.s[W.cp.test == 0])
undifferentiated_allocation_s <- mean(Y.cp.test.s[W.cp.test == 1])
optimal_allocation_s <- mean(c(Y.cp.test.s[seg_cf_s1_s & W.cp.test == 0],
                               Y.cp.test.s[seg_cf_s2_s & W.cp.test == 1]))
# BUG FIX: digits argument was round(3) (harmless but accidental) -> 3.
offline_evaluation_s <- round(rbind(
  random_allocation_s,
  statusquo_allocation_s,
  undifferentiated_allocation_s,
  optimal_allocation_s), 3)
colnames(offline_evaluation_s) <- "Saving rate"
# BUG FIX: labels must follow the rbind() row order; the original swapped the
# first two (note 0.047 is the overall mean, which must lie between the
# control mean 0.061 and the treated mean 0.025 — so it is "Random", not
# "Status Quo").
rownames(offline_evaluation_s) <- c("Random",
                                    "Status Quo",
                                    "Undifferentiated",
                                    "Optimized with CF")
print(offline_evaluation_s)
## With corrected labels the table prints as:
##                   Saving rate
## Random                  0.047
## Status Quo              0.061
## Undifferentiated        0.025
## Optimized with CF       0.062
# conditional saving rate
# Offline policy evaluation for the saving rate conditional on viewing (same
# scheme: CATE < 0 -> control, CATE >= 0 -> percentage treatment).
Y.cp.cond.test.s <- cp_cond_test$saving_response
W.cp.cond.test <- cp_cond_test$treat
seg_cf_s1_cond_s <- tau_hat_cp_cond_s < 0   # control
seg_cf_s2_cond_s <- tau_hat_cp_cond_s >= 0  # percentage
# Share of test users each arm would receive under the CF policy.
# (Renamed from optimal_allocation_cond_s: the original reused that name for
# two different quantities; the first use was overwritten below anyway.)
policy_shares_cond_s <- c(sum(seg_cf_s1_cond_s),
                          sum(seg_cf_s2_cond_s)) / length(Y.cp.cond.test.s)
print(policy_shares_cond_s)
## [1] 0.8527508 0.1472492
random_allocation_cond_s <- mean(Y.cp.cond.test.s)
statusquo_allocation_cond_s <- mean(Y.cp.cond.test.s[W.cp.cond.test == 0])
undifferentiated_allocation_cond_s <- mean(Y.cp.cond.test.s[W.cp.cond.test == 1])
optimal_allocation_cond_s <- mean(c(
  Y.cp.cond.test.s[seg_cf_s1_cond_s & W.cp.cond.test == 0],
  Y.cp.cond.test.s[seg_cf_s2_cond_s & W.cp.cond.test == 1]))
# BUG FIX: digits argument was round(3) (harmless but accidental) -> 3.
offline_evaluation_cond_s <- round(rbind(
  random_allocation_cond_s,
  statusquo_allocation_cond_s,
  undifferentiated_allocation_cond_s,
  optimal_allocation_cond_s), 3)
colnames(offline_evaluation_cond_s) <- "Conditional saving rate"
# BUG FIX: labels must follow the rbind() row order; the original swapped the
# first two (0.192 is the overall mean, lying between the control mean 0.243
# and the treated mean 0.091 — so it is "Random", not "Status Quo").
rownames(offline_evaluation_cond_s) <- c("Random",
                                         "Status Quo",
                                         "Undifferentiated",
                                         "Optimized with CF")
print(offline_evaluation_cond_s)
## With corrected labels the table prints as:
##                   Conditional saving rate
## Random                              0.192
## Status Quo                          0.243
## Undifferentiated                    0.091
## Optimized with CF                   0.248